Libraries
library(readxl)
library(summarytools)
#devtools::install_github("boxuancui/DataExplorer")
library(DataExplorer)
library(kableExtra)
library(dplyr)
library(reshape2)
library(lubridate)
library(tidyverse)
#library(textshape)
library(magrittr)
library(ggplot2)
library(factoextra)
library(FactoMineR)
library(cluster)
library(clValid)
library(ape)
library(dendextend)
library(clustertend)
library(NbClust)
library(ggiraphExtra)
library(plotly)
Load Data
# Working directory for the project data.
# NOTE(review): setwd() makes the script machine-specific — prefer an
# RStudio project or here::here() so paths resolve portably.
setwd("/Users/danielesilva/Documents/Data Science Projects/Data_Science/Clustering Spotify")
# Load data
# Source: http://organizeyourmusic.playlistmachinery.com/
# Variables:
# Genre - the genre of the track
# Year - the release year of the recording. Note that due to vagaries of releases, re-releases, re-issues and general madness, sometimes the release years are not what you'd expect.
# Added - the earliest date you added the track to your collection.
# Beats Per Minute (BPM) - The tempo of the song.
# Energy - The energy of a song - the higher the value, the more energetic the song.
# Danceability - The higher the value, the easier it is to dance to this song.
# Loudness (dB) - The higher the value, the louder the song.
# Liveness - The higher the value, the more likely the song is a live recording.
# Valence - The higher the value, the more positive mood for the song.
# Length - The duration of the song.
# Acousticness - The higher the value the more acoustic the song is.
# Speechiness - The higher the value the more spoken word the song contains.
# Popularity - The higher the value the more popular the song is.
# Duration - The length of the song.
data <- read_xlsx("my_top20.xlsx")
Exploratory Data
# Preview the first rows of the raw dataset
head(data)
## # A tibble: 6 x 13
## N TITLE ARTIST RELEASE BPM ENERGY DANCE LOUD VALENCE
## <dbl> <chr> <chr> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Somebody … Gotye 2011-01-01 00:00:00 129 52 87 -7 75
## 2 2 Blinding … The Wee… 2020-03-20 00:00:00 171 73 51 -6 33
## 3 3 Café Vitão 2019-03-08 00:00:00 92 50 78 -5 49
## 4 4 Graveto -… Marília… 2020-01-10 00:00:00 93 63 71 -5 40
## 5 5 Sei Lá Projota 2019-04-25 00:00:00 152 43 66 -9 96
## 6 6 Don't Sta… Dua Lipa 2020-03-27 00:00:00 124 79 79 -5 68
## # … with 4 more variables: LENGTH <dttm>, ACOUSTIC <dbl>, POP. <dbl>, RND <dbl>
# Column-wise summary statistics rendered as a styled HTML table
summary(data) %>% kable() %>% kable_styling()
|
|
N
|
TITLE
|
ARTIST
|
RELEASE
|
BPM
|
ENERGY
|
DANCE
|
LOUD
|
VALENCE
|
LENGTH
|
ACOUSTIC
|
POP.
|
RND
|
|
|
Min. : 1.00
|
Length:100
|
Length:100
|
Min. :1966-08-05 00:00:00
|
Min. : 80.0
|
Min. :17.00
|
Min. :33.00
|
Min. :-15.00
|
Min. :10.00
|
Min. :1899-12-31 02:00:00
|
Min. : 0.00
|
Min. :30.00
|
Min. : 45
|
|
|
1st Qu.: 25.75
|
Class :character
|
Class :character
|
1st Qu.:2018-02-16 00:00:00
|
1st Qu.:102.8
|
1st Qu.:59.75
|
1st Qu.:59.50
|
1st Qu.: -6.25
|
1st Qu.:49.00
|
1st Qu.:1899-12-31 02:50:45
|
1st Qu.: 7.75
|
1st Qu.:65.75
|
1st Qu.:2526
|
|
|
Median : 50.50
|
Mode :character
|
Mode :character
|
Median :2019-08-16 00:00:00
|
Median :123.5
|
Median :72.00
|
Median :69.00
|
Median : -5.00
|
Median :64.00
|
Median :1899-12-31 03:09:30
|
Median :19.00
|
Median :72.50
|
Median :5306
|
|
|
Mean : 50.50
|
NA
|
NA
|
Mean :2015-09-24 07:12:00
|
Mean :126.0
|
Mean :69.58
|
Mean :67.69
|
Mean : -5.54
|
Mean :62.79
|
Mean :1899-12-31 03:14:28
|
Mean :28.22
|
Mean :72.65
|
Mean :4908
|
|
|
3rd Qu.: 75.25
|
NA
|
NA
|
3rd Qu.:2019-12-13 00:00:00
|
3rd Qu.:146.2
|
3rd Qu.:82.25
|
3rd Qu.:78.00
|
3rd Qu.: -4.00
|
3rd Qu.:80.00
|
3rd Qu.:1899-12-31 03:35:00
|
3rd Qu.:46.25
|
3rd Qu.:83.00
|
3rd Qu.:6728
|
|
|
Max. :100.00
|
NA
|
NA
|
Max. :2020-06-30 00:00:00
|
Max. :196.0
|
Max. :95.00
|
Max. :96.00
|
Max. : 2.00
|
Max. :97.00
|
Max. :1899-12-31 05:04:00
|
Max. :94.00
|
Max. :97.00
|
Max. :9945
|
# Alternative overviews (kept for reference)
#glimpse(data) %>% kable() %>% kable_styling()
#st_options(plain.ascii = FALSE)
# Per-variable summary table with inline histograms (summarytools)
print(dfSummary(data, graph.magnif = 0.75), method = 'render')
# DataExplorer quick-look plots
plot_str(data)      # structure of the dataset
plot_missing(data)  # missing-value profile per column

plot_histogram(data)  # distribution of each numeric variable

plot_correlation(data, type = 'continuous')  # correlation heatmap of numeric columns

#plot_bar(data)
#plot_boxplot(data)
Tratamento de Dados (Data Preparation)
# Make all column names syntactically valid R names (e.g. "POP." remains referenceable)
names(data) <- make.names(names(data))
names(data)
## [1] "N" "TITLE" "ARTIST" "RELEASE" "BPM" "ENERGY"
## [7] "DANCE" "LOUD" "VALENCE" "LENGTH" "ACOUSTIC" "POP."
## [13] "RND"
# Remove duplicate rows, derive DURATION, and keep only the numeric audio
# features used for clustering.
data_numeric <- data %>%
  unique() %>%
  # Transform length: LENGTH is stored as a datetime whose %H:%M fields
  # appear to encode minutes:seconds of the track (e.g. 03:09:30 = 3m09s);
  # reformatting to "%H:%M" and parsing with lubridate::ms() recovers a
  # Period — NOTE(review): confirm this encoding against the source file.
  mutate(DURATION = ms(format(as.POSIXct(strptime(LENGTH, "%Y-%m-%d %H:%M:%S", tz = "")), format = "%H:%M"))) %>%
  # Drop identifiers and non-feature columns
  select(-N, -TITLE, -ARTIST, -RELEASE, -RND, -POP., -LENGTH)
# create y for vtreat (kept for reference)
# mutate(y = rowSums(across(where(is.numeric))))
str(data_numeric)
## tibble [100 × 7] (S3: tbl_df/tbl/data.frame)
## $ BPM : num [1:100] 129 171 92 93 152 124 98 138 130 95 ...
## $ ENERGY : num [1:100] 52 73 50 63 43 79 59 28 48 82 ...
## $ DANCE : num [1:100] 87 51 78 71 66 79 82 58 73 55 ...
## $ LOUD : num [1:100] -7 -6 -5 -5 -9 -5 -6 -9 -7 -4 ...
## $ VALENCE : num [1:100] 75 33 49 40 96 68 51 81 50 56 ...
## $ ACOUSTIC: num [1:100] 55 0 5 29 52 1 69 94 57 12 ...
## $ DURATION:Formal class 'Period' [package "lubridate"] with 6 slots
## .. ..@ .Data : num [1:100] 5 20 10 51 10 3 29 7 59 54 ...
## .. ..@ year : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ month : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ day : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ hour : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ minute: num [1:100] 4 3 3 2 3 3 3 2 2 2 ...
# Standardize features (mean 0, sd 1) so distance-based clustering is not
# dominated by differences in scale.
# NOTE(review): DURATION is a lubridate Period; scale() coerces it to
# numeric — confirm the resulting units are what is intended.
data_scale <- as.data.frame(scale(data_numeric))
ggplot(stack(data_scale), aes(x=ind, y=values)) + geom_boxplot(aes(fill=ind))

# Outliers visible in ENERGY and LOUD
# Use track titles as row names so cluster plots are labelled.
# NOTE(review): this assumes unique() above dropped no rows and that
# titles are distinct — a length mismatch or duplicate title errors here.
rownames(data_scale) <- data$TITLE
Choose the optimal number of clusters
set.seed(1990)
# Are the data clusterable? Hopkins statistic (clustertend::hopkins):
# values near 0.5 suggest uniform data, values toward 0 suggest genuine
# cluster tendency — TODO confirm interpretation for this implementation.
hopkins(data_scale, n = nrow(data_scale)-1)
## $H
## [1] 0.3868532
# PCA on the scaled features (FactoMineR)
res.pca <- PCA(data_scale, graph = FALSE)
# Visualize variance explained per principal component
fviz_screeplot(res.pca, addlabels = TRUE, ylim = c(0, 50))

# Results for variables (coordinates, contributions, cos2)
var <- get_pca_var(res.pca)
# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

# Control variable colors using their contributions to the principal axes
fviz_pca_var(res.pca, col.var="contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
) + theme_minimal() + ggtitle("Variables - PCA")

# 1 - Elbow: look for the "knee" in total within-cluster sum of squares
fviz_nbclust(data_scale, kmeans, method = "wss", k.max = 24) + theme_minimal() + ggtitle("the Elbow Method")

# 2 - Gap statistic: compares observed WSS against B = 50 null references
gap_stat <- clusGap(data_scale, FUN = kmeans, nstart = 30, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")

# 3 - Silhouette: average silhouette width per candidate k
fviz_nbclust(data_scale, kmeans, method = "silhouette", k.max = 24) + theme_minimal() + ggtitle("The Silhouette Plot")

# 4 - Sum of Squares
# 4 - Sum of Squares
# kmeans() is sensitive to the initial centroid positions, so nstart = 30
# generates 30 random starting configurations and keeps the best result.
# (The original code applied nstart only for k = 2 and 3; it is applied
# uniformly here, as the accompanying comment described.)
k_values <- 2:10
km_models <- lapply(k_values, function(k) kmeans(data_scale, centers = k, nstart = 30))

# Average within-cluster SS and total between-cluster SS for each k
ssc <- data.frame(
  kmeans = k_values,
  within_ss = vapply(km_models, function(m) mean(m$withinss), numeric(1)),
  between_ss = vapply(km_models, function(m) m$betweenss, numeric(1))
)
# Long format for side-by-side bars (pivot_longer supersedes gather)
ssc <- ssc %>% pivot_longer(-kmeans, names_to = "measurement", values_to = "value")
# Map k as a factor so the axis shows one tick per cluster count — the
# original applied scale_x_discrete() to a numeric x, which breaks the
# axis mapping.
ssc %>% ggplot(aes(x = factor(kmeans), y = log10(value), fill = measurement)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Cluster Model Comparison") +
  xlab("Number of Clusters") +
  ylab("Log10 Total Sum of Squares")

# 5 - NbClust: computes ~30 validity indices and reports the majority vote
# for the best number of clusters (Euclidean distance, complete linkage).
res.nbclust <- NbClust(data_scale, distance = "euclidean",
min.nc = 2, max.nc = 9,
method = "complete", index ="all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 7 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 7 proposed 6 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Bar chart of the NbClust vote tally; the explicit factoextra:: namespace
# disambiguates from the fviz_nbclust(data, FUN, ...) form used above.
factoextra::fviz_nbclust(res.nbclust) + theme_minimal() + ggtitle("NbClust's optimal number of clusters")
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") .viz_NbClust(x, print.summary, : the
## condition has length > 1 and only the first element will be used
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") {: the condition has length > 1 and
## only the first element will be used
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 7 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 7 proposed 6 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .

Clustering
# Internal validation (connectivity, Dunn index, silhouette) across four
# clustering algorithms for k = 2..24.
intern <- clValid(data_scale, nClust = 2:24,
clMethods = c("hierarchical","kmeans","pam","clara"), validation = "internal")
# Summary
summary(intern) %>% kable() %>% kable_styling()
##
## Clustering Methods:
## hierarchical kmeans pam clara
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
##
## hierarchical Connectivity 2.9290 14.9460 19.0956 22.2385 36.2270 38.7056 54.7960 69.6940 72.1202 73.4536 87.0024 91.6425 94.1131 98.5024 100.9813 102.8948 104.6071 113.9536 116.3798 118.4131 119.2004 121.4599 122.9460
## Dunn 0.3282 0.2925 0.2962 0.2962 0.2687 0.2687 0.2315 0.2022 0.2022 0.2022 0.2481 0.2481 0.2611 0.2611 0.2646 0.2646 0.2646 0.3362 0.3362 0.3362 0.3362 0.3362 0.3362
## Silhouette 0.3055 0.2607 0.2219 0.2054 0.1563 0.1354 0.1495 0.1360 0.1228 0.1177 0.1485 0.1523 0.1273 0.1233 0.1298 0.1327 0.1271 0.1551 0.1476 0.1444 0.1462 0.1448 0.1442
## kmeans Connectivity 28.0599 56.5659 70.6885 78.1079 83.8242 81.9790 91.8079 98.8040 101.2929 103.1262 113.5730 114.8782 112.1492 113.7115 117.8000 122.4063 128.5587 133.7075 137.0837 141.1226 146.1909 149.8587 147.7857
## Dunn 0.2239 0.1438 0.1480 0.1711 0.2130 0.2163 0.2175 0.2246 0.2246 0.2246 0.2162 0.2162 0.2725 0.2709 0.2709 0.2723 0.2378 0.2828 0.2828 0.2828 0.2838 0.2838 0.2850
## Silhouette 0.2401 0.1449 0.1716 0.1598 0.1604 0.1718 0.1740 0.1857 0.1807 0.1740 0.1673 0.1661 0.1699 0.1746 0.1730 0.1690 0.1598 0.1631 0.1617 0.1580 0.1619 0.1576 0.1727
## pam Connectivity 52.8937 67.2933 101.6187 93.7992 108.5433 105.4698 106.6952 110.8313 112.4298 114.7060 115.1587 119.2746 124.5937 126.9611 130.3425 131.8520 135.5048 140.7603 143.1270 147.2710 154.3119 158.4560 152.8210
## Dunn 0.1290 0.1342 0.1562 0.1480 0.1279 0.1279 0.1533 0.1406 0.1502 0.1673 0.1673 0.1673 0.1673 0.1949 0.1949 0.2081 0.2081 0.2081 0.2081 0.2081 0.2111 0.2424 0.2111
## Silhouette 0.1946 0.1346 0.0980 0.1273 0.1247 0.1116 0.1424 0.1232 0.1221 0.1206 0.1300 0.1278 0.1247 0.1261 0.1327 0.1368 0.1383 0.1409 0.1397 0.1391 0.1337 0.1330 0.1427
## clara Connectivity 63.3198 66.9361 92.1540 109.1377 88.2468 100.7560 111.7020 112.4694 121.3897 124.5119 125.5841 119.7817 137.0821 127.9401 144.2286 134.6952 147.3790 144.5032 144.0798 151.0274 155.8020 151.0313 154.0948
## Dunn 0.0877 0.1436 0.1201 0.1323 0.1466 0.1325 0.1343 0.1423 0.1543 0.1817 0.1963 0.2184 0.2268 0.1997 0.1947 0.1982 0.1947 0.2081 0.2111 0.2666 0.2080 0.2666 0.1920
## Silhouette 0.1510 0.1360 0.0970 0.1064 0.1222 0.1211 0.1260 0.1326 0.1191 0.1215 0.0948 0.1177 0.1209 0.1232 0.1208 0.1187 0.1171 0.1225 0.1281 0.1306 0.1158 0.1139 0.1394
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 2.9290 hierarchical 2
## Dunn 0.3362 hierarchical 19
## Silhouette 0.3055 hierarchical 2
# Dendrogram: hierarchical clustering with Ward linkage on Euclidean distance
d <- dist(data_scale, method = "euclidean")
res.hc <- hclust(d, method = "ward.D2")
# Cut the tree into 6 groups
grp <- cutree(res.hc, k = 6)
# Visualize
# plot(res.hc, cex = 0.5) # plot tree
# Color labels by cluster membership (k = 6)
colors <- RColorBrewer::brewer.pal(6, "Set2")
dend <- as.dendrogram(res.hc)
labels_cex(dend) <- 0.5
dend %>% set("labels_col", value = colors, k = 6) %>%
  plot(main = "Color labels \nper cluster", cex = 1)

# Circular (fan) layout of the same tree, tips colored by cluster
plot(as.phylo(res.hc), type = "fan", tip.color = colors[grp],
     label.offset = 1, cex = 0.5)

# CLARA (Clustering LARge Applications): PAM applied to repeated subsamples.
# Run with k = 6 clusters over 100 samples — the original comment said
# "k-means with k=5", which was inaccurate on both counts.
clara_clust <- clara(data_scale, 6, samples = 100)
fviz_cluster(clara_clust, data = data_scale, ellipse.type = "norm", geom = "text",
labelsize = 8, palette = colors, show.clust.cent = TRUE) + theme_minimal() +
ggtitle("k = 6")

Resultados (Results)
# Attach the hierarchical cluster assignment to the original data and
# rebuild DURATION as a minutes:seconds Period (same encoding as above).
# NOTE(review): grp was computed on data_scale, which came from
# unique(data); if any duplicate rows were dropped, length(grp) will not
# match nrow(data) and this mutate() errors — confirm.
data_clusters <- as.data.frame(data) %>%
  mutate(Cluster = as.factor(grp)) %>%
  mutate(DURATION = ms(format(as.POSIXct(strptime(LENGTH, "%Y-%m-%d %H:%M:%S", tz = "")), format = "%H:%M")))
# Mean of each feature per cluster. Columns 5:11 span BPM..ACOUSTIC
# (including LENGTH, a datetime whose mean is the large negative number in
# the output — consider excluding it); 14:15 are Cluster and DURATION.
aggregate(data = data_clusters[,c(5:11,14:15)], . ~ Cluster, mean) %>% kable() %>% kable_styling()
|
Cluster
|
BPM
|
ENERGY
|
DANCE
|
LOUD
|
VALENCE
|
LENGTH
|
ACOUSTIC
|
DURATION
|
|
1
|
118.8571
|
49.00000
|
74.00000
|
-7.571429
|
76.85714
|
-2209063449
|
63.285714
|
15.857143
|
|
2
|
134.5625
|
76.75000
|
47.00000
|
-5.125000
|
39.25000
|
-2209063365
|
33.750000
|
28.500000
|
|
3
|
113.3077
|
72.69231
|
75.53846
|
-5.230769
|
65.61538
|
-2209063329
|
9.307692
|
8.615385
|
|
4
|
102.9545
|
69.09091
|
70.95455
|
-5.590909
|
55.50000
|
-2209063091
|
24.181818
|
40.909091
|
|
5
|
144.1944
|
76.36111
|
73.63889
|
-4.500000
|
79.38889
|
-2209064387
|
23.416667
|
35.222222
|
|
6
|
113.6667
|
28.83333
|
50.83333
|
-11.000000
|
30.16667
|
-2209061010
|
57.166667
|
16.500000
|
# Join the scaled features with the cluster labels by row name (track title)
data_df <- as.data.frame(data_scale) %>% rownames_to_column()
cluster_pos <- as.data.frame(grp) %>% rownames_to_column()
colnames(cluster_pos) <- c("rowname", "cluster")
# Explicit join key — the original relied on inner_join's auto-detection,
# which only emitted a 'Joining, by = "rowname"' message.
data_scale_cl <- inner_join(cluster_pos, data_df, by = "rowname")
# Radar chart of the mean attribute profile of each cluster
ggRadar(data_scale_cl[-1], aes(group = cluster), rescale = FALSE,
        legend.position = "none", size = 1, interactive = FALSE, use.label = TRUE) +
  facet_wrap(~cluster) + scale_y_discrete(breaks = NULL) + # don't show ticks
  theme(axis.text.x = element_text(size = 10)) +
  scale_fill_manual(values = rep(colors, nrow(data_scale_cl))) +
  scale_color_manual(values = rep(colors, nrow(data_scale_cl))) +
  ggtitle("Music Attributes per Cluster")

# Arrange data for plotting: one row per (track, attribute) pair so each
# audio feature gets its own density facet.
data_plot <- melt(data_clusters, id.vars = c("N", "Cluster"),
                  measure.vars = c("BPM", "ENERGY", "DANCE", "LOUD", "VALENCE",
                                   "ACOUSTIC"),
                  variable.name = "VAR",
                  value.name = "Value")
# Per-cluster density of each attribute, free scales per facet
p_all <- ggplot(data = data_plot, aes(x = Value, fill = Cluster)) +
  geom_density(alpha = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  facet_wrap(. ~ VAR, scales = "free")
# Interactive version via plotly
subplot(ggplotly(p_all))